Simulated UKB data including disease status, risk factor and
proteomics
The risk factors names are listed from the Stroke related ones.
Record the summarized mean and standard deviance for each risk factor,
including whether the predictors are binary or continuous.
risk_factor_names<-c("sex","age","dbp","sbp","T2D","height","BMI","cholesterol","HDL","LDL","crp","pulse_rate","trig","hyperlipid","SmokingStatus","AlcoholFreq","PackYrs","Basal_metabolic_rate","Aspirin","statin","father.Stroke","mother.Stroke","physical","edu","social","diet")
mean_risk_factor<-c(0.4544,56.7359,82.1773,136.0661,0.0479,168.6782,27.3743,221.0858,56.2353,138.1562,2.5619,69.4559,154.7082,0.8422,0.5607,2.8560,10.5340,6626.7823,0.1363,0.1737,0.1359,0.1314,2669.5590,0.3277,0.7016,0.7360)
sd_risk_factor<-c(0.4979,8.0211,9.9597,18.1335,0.2135,9.2446,4.7546,43.6913,14.0612,33.2142,4.2946,11.2386,89.9247,0.3645,0.6716,1.4821,15.4259,1358.2518,0.3431,0.3788,0.3427,0.3379,2453.7016,0.4694,0.4576,0.5951)
names(mean_risk_factor)<-risk_factor_names
print(mean_risk_factor)
## sex age dbp
## 0.4544 56.7359 82.1773
## sbp T2D height
## 136.0661 0.0479 168.6782
## BMI cholesterol HDL
## 27.3743 221.0858 56.2353
## LDL crp pulse_rate
## 138.1562 2.5619 69.4559
## trig hyperlipid SmokingStatus
## 154.7082 0.8422 0.5607
## AlcoholFreq PackYrs Basal_metabolic_rate
## 2.8560 10.5340 6626.7823
## Aspirin statin father.Stroke
## 0.1363 0.1737 0.1359
## mother.Stroke physical edu
## 0.1314 2669.5590 0.3277
## social diet
## 0.7016 0.7360
We assume all binary risk factors follow binomial distribution and
continuous risk factors follow normal distribution. We follow the same
disease prevalence of Stroke from UKB. The main study is assumed to be
10K, and the external study is 9 times more, where the ratio is similar
to actual case.
Although there are in total 15K proteins in proteomics data, we
assume there are 200 proteins in simulation.
binary_predictors<-c("sex","T2D","hyperlipid","SmokingStatus","Aspirin","statin","father.Stroke","mother.Stroke","edu","social","diet")
n_main<-10000
n_external<-90000
n_joint<-n_main+n_external
p_protein<-200
risk_factor<-list()
for( i in which(risk_factor_names%in%binary_predictors)){
risk_factor[[i]]<-rbinom(n = n_joint,size = 1,prob = mean_risk_factor[i])
}
for( i in which(!risk_factor_names%in%binary_predictors)){
risk_factor[[i]]<-rnorm(n = n_joint,mean = mean_risk_factor[i],sd = sd_risk_factor[i])
}
risk_factor_matrix<-Reduce("cbind",risk_factor)
Check the data structure of the risk factor matrix, which is similar
to the actual data structure.
colnames(risk_factor_matrix)<-risk_factor_names
round(head(risk_factor_matrix,2),2)
## sex age dbp sbp T2D height BMI cholesterol HDL LDL crp
## [1,] 1 52.63 80.27 165.16 0 181.90 22.81 253.29 60.89 129.16 0.46
## [2,] 0 57.37 84.08 129.72 0 161.01 30.94 210.45 61.29 102.46 0.58
## pulse_rate trig hyperlipid SmokingStatus AlcoholFreq PackYrs
## [1,] 69.67 84.93 1 0 2.37 12.89
## [2,] 57.32 -4.11 1 0 1.65 21.44
## Basal_metabolic_rate Aspirin statin father.Stroke mother.Stroke physical
## [1,] 6703.98 1 0 0 0 3363.96
## [2,] 6839.11 0 0 0 0 5489.43
## edu social diet
## [1,] 1 1 1
## [2,] 1 1 1
The proteins are assumed to follow N(0,1).
proteomics_matrix<-matrix(rnorm(n_joint*p_protein),nrow = n_joint,ncol = p_protein)
eg_proteomics<-proteomics_matrix[1:6,1:5]
colnames(eg_proteomics)<-c("aarsd1","abhd14b","abl1","acaa1","ace2")
eg_proteomics
## aarsd1 abhd14b abl1 acaa1 ace2
## [1,] -0.52351275 -0.1650609 -0.11427517 1.1924578 0.6847854
## [2,] -0.56816005 -2.3044206 -0.69394226 -2.0769323 -0.1110615
## [3,] 0.73540867 1.0189194 -1.13674962 1.1733944 0.1761003
## [4,] -0.06463737 -0.1101567 0.02631153 1.3979223 0.7898039
## [5,] 1.71409754 2.2031612 1.64312064 0.0366995 0.6623715
## [6,] 0.29790269 0.6039755 1.53544443 0.5039537 -0.3919164
We assume the disease status follows binomial distribution, and the
disease prevalence uses the similar actual case ratio (roughly 1%).
coef_risk_factor<-rep(0,length(risk_factor_names))
coef_proteomics<-rep(0,p_protein)
set.seed(2023)
coef_risk_factor[c(1,4,9,16,25)]<- 0.4
coef_proteomics[c(1,4,9,16,25)]<- -0.4
mean_case<-0.0108
Y<-rbinom(n_joint,size = 1,prob =locfit::expit(
log(mean_case/(1-mean_case))+c(scale(risk_factor_matrix)%*%coef_risk_factor)+c(proteomics_matrix%*%coef_proteomics)))
print(paste0("simulated disease prevalence:",sum(Y)/n_joint))
## [1] "simulated disease prevalence:0.02195"
Separate the simulated data into main study and external study.
main_study<-data.frame(Y = Y[1:n_main],risk_factor_matrix[1:n_main,],proteomics_matrix[1:n_main,])
external_study<-data.frame(Y = Y[-c(1:n_main)],risk_factor_matrix[-c(1:n_main),])
Scale the predictors from main study and external study.
main_study[,-1]=scale(main_study[,-1])
external_study[,-1]=scale(external_study[,-1])
Extract summary statistics from external study.
family = "binomial"
study_info=list()
external_glm=glm(Y~.,data = external_study,family = family)
study_external = list(
Coeff=external_glm$coefficients[-1],
Covariance=vcov(external_glm)[-1,-1],
Sample_size=nrow(external_study))
study_info[[1]] = study_external
#study_info
Split main study into train and test with the proportion (70% and
30%).
library(caret)
set.seed(2023)
test_index=createDataPartition(main_study$Y,p = 0.3)$Resample1
y and X are training set for main study
y = main_study$Y[-test_index]
X = as.matrix(main_study[-test_index,-1])
X<-scale(X)
y_test = main_study$Y[test_index]
X_test = as.matrix(main_study[test_index,-1])
X_test<-scale(X_test)
p_risk = ncol(external_study)-1
Perform cv.glmnet only for main study’s training set and
check the performance on main study’s test set. The criterion is
AUC.
start_t=Sys.time()
main_lasso=cv.glmnet(x=X, y=y, family=family)
end_t=Sys.time()
sprintf('cv.glmnet time: %.2f %s', end_t-start_t, units(end_t-start_t))
## [1] "cv.glmnet time: 7.15 secs"
pred_main_lasso=predict(main_lasso,newx=X_test)
library(pROC)
auc_main_lasso=auc(y_test,c(locfit::expit(pred_main_lasso)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
Perform cv.htlgmm only for main study’s training set and
check the performance on main study’s test set.
start_t=Sys.time()
main_external_htlgmm=cv.htlgmm(y = y,
Z = X[,c(1:p_risk)],
W = X[,-c(1:p_risk)],
study_info = study_info,
family = "binomial",
A = 1,nfolds = 10,
use_sparseC = T)
end_t=Sys.time()
sprintf('cv.htlgmm time: %.2f %s', end_t-start_t, units(end_t-start_t))
## [1] "cv.htlgmm time: 21.19 secs"
Check the AUC comparison between cv.glmnet and
cv.htlgmm.
pred_main_external_htlgmm=X_test%*%main_external_htlgmm$beta[1:ncol(X_test)]
auc_main_external_htlgmm=auc(y_test,c(locfit::expit(pred_main_external_htlgmm)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste0("AUC on test data: lasso(",round(auc_main_lasso,4) ,"); htlgmm(",round(auc_main_external_htlgmm,4),")"))
## [1] "AUC on test data: lasso(0.7253); htlgmm(0.7371)"
More details of the output
print(list("lambda_list"=main_external_htlgmm$lambda_list,
"cv_dev"=main_external_htlgmm$cv_dev,
"cv_auc"=main_external_htlgmm$cv_auc,
"lambda_min"=main_external_htlgmm$lambda_min))
## $lambda_list
## [1] 2.169188e-02 1.976483e-02 1.800898e-02 1.640911e-02 1.495137e-02
## [6] 1.362313e-02 1.241289e-02 1.131016e-02 1.030540e-02 9.389897e-03
## [11] 8.555725e-03 7.795658e-03 7.103113e-03 6.472092e-03 5.897129e-03
## [16] 5.373244e-03 4.895900e-03 4.460962e-03 4.064662e-03 3.703569e-03
## [21] 3.374554e-03 3.074768e-03 2.801614e-03 2.552726e-03 2.325949e-03
## [26] 2.119318e-03 1.931044e-03 1.759495e-03 1.603187e-03 1.460764e-03
## [31] 1.330994e-03 1.212752e-03 1.105014e-03 1.006848e-03 9.174023e-04
## [36] 8.359028e-04 7.616435e-04 6.939812e-04 6.323298e-04 5.761554e-04
## [41] 5.249713e-04 4.783343e-04 4.358404e-04 3.971216e-04 3.618424e-04
## [46] 3.296973e-04 3.004079e-04 2.737205e-04 2.494039e-04 2.272476e-04
## [51] 2.070595e-04 1.886649e-04 1.719044e-04 1.566329e-04 1.427181e-04
## [56] 1.300394e-04 1.184871e-04 1.079610e-04 9.837004e-05 8.963112e-05
## [61] 8.166854e-05 7.441333e-05 6.780266e-05 6.177925e-05 5.629096e-05
## [66] 5.129022e-05 4.673374e-05 4.258204e-05 3.879917e-05 3.535236e-05
## [71] 3.221175e-05
##
## $cv_dev
## [1] 181.6923 181.9944 182.4223 182.9217 183.4635 183.5544 182.8390 180.9765
## [9] 178.3618 175.7085 173.4402 171.5204 169.8810 168.4104 166.9158 165.1165
## [17] 162.7065 160.1565 157.5770 154.8562 152.1600 149.6250 147.3643 145.2653
## [25] 143.3362 141.5247 140.0426 138.7934 137.7547 136.7768 135.8784 135.1297
## [33] 134.5264 134.0180 133.6831 133.4495 133.3126 133.2565 133.2828 133.3811
## [41] 133.5348 133.7036 133.8852 134.0898 134.3194 134.5437 134.7678 134.9973
## [49] 135.2179 135.4324 135.6320 135.8248 136.0099 136.1897 136.3611 136.5223
## [57] 136.6731 136.8140 136.9462 137.0669 137.1752 137.2780 137.3729 137.4586
## [65] 137.5348 137.6067 137.6716 137.7334 137.7912 137.8424 137.8860
##
## $cv_auc
## [1] 0.4619772 0.4437079 0.4332497 0.4263902 0.4262204 0.4520445 0.4969185
## [8] 0.5508842 0.6020543 0.6402238 0.6614642 0.6749011 0.6829739 0.6890645
## [15] 0.6954304 0.7074099 0.7257380 0.7435218 0.7567999 0.7650456 0.7695033
## [22] 0.7712746 0.7692235 0.7675764 0.7654870 0.7626223 0.7581429 0.7536739
## [29] 0.7508302 0.7494260 0.7486425 0.7487945 0.7494286 0.7506696 0.7509974
## [36] 0.7517358 0.7525030 0.7531301 0.7534370 0.7534637 0.7531786 0.7534648
## [43] 0.7538024 0.7543078 0.7547872 0.7548822 0.7551224 0.7552095 0.7553778
## [50] 0.7553494 0.7554483 0.7554922 0.7555710 0.7555935 0.7557577 0.7558526
## [57] 0.7559110 0.7560504 0.7560377 0.7559871 0.7560774 0.7560327 0.7560929
## [64] 0.7561177 0.7560938 0.7561797 0.7562186 0.7561436 0.7561652 0.7560998
## [71] 0.7561815
##
## $lambda_min
## [1] 0.0006939812
Plot to show the change of cross validation deviance against log
lambda (tuning parameter for lasso penalty).
df = data.frame(lambda = main_external_htlgmm$lambda_list,
cv_dev=main_external_htlgmm$cv_dev,
cv_auc=main_external_htlgmm$cv_auc)
library(ggplot2)
ggplot(df)+
geom_line(aes(x=log(lambda),y=cv_dev))+
geom_vline(xintercept = log(main_external_htlgmm$lambda_min),
linetype=2)+
theme(panel.background=element_rect(fill='transparent', color='black'))+
geom_text(aes(x=log(main_external_htlgmm$lambda_min),y=median(df$cv_dev),label='lambda with min cv deviance'))
## Warning: Use of `df$cv_dev` is discouraged.
## ℹ Use `cv_dev` instead.

The real data analysis record
filter_risk=readRDS(paste0("/dcs04/nilanjan/data/rzhao/UKB/",foldname,"_risk.rds"))
print(table(filter_risk$Y))
## 0 1
## 423618 4633
head(round(filter_risk,2))
##
## Y sex age dbp sbp T2D height BMI cholesterol
## 5367276 0 1 61 71 139 0 167 31.9122 209.3822
## 5390216 0 0 64 69 120 0 170 24.0484 246.6023
## HDL LDL crp pulse_rate trig hyperlipid
## 5367276 58.37838 135.4440 0.89 56 66.37168 1
## 5390216 65.13514 156.9498 0.34 63 130.00000 1
## SmokingStatus AlcoholFreq PackYrs Basal_metabolic_rate
## 5367276 1 3 24 7778
## 5390216 0 1 0 5749
## Aspirin statin father.Stroke mother.Stroke physical edu
## 5367276 0 0 FALSE FALSE 1859.000 0
## 5390216 0 0 FALSE TRUE 2654.681 0
## social diet
## 5367276 1 2
## 5390216 1 1
proteomics0=readRDS("/dcs04/nilanjan/data/rzhao/UKB/all_proteomic_imputed.rds")
dim(proteomics0)
## [1] 48674 1459
print(proteomics0[1:6,1:5])
##
## aarsd1 abhd14b abl1 acaa1 ace2
## 1000130 -0.78070 -0.84570000 -0.66810 -0.8770 -0.17715
## 1000350 0.37550 -0.71190000 0.13610 -0.1128 0.83250
## 1000365 -0.10355 -0.96625000 -0.48105 -1.2763 -0.55790
## 1000388 0.59865 0.02825000 -1.59355 -1.2109 0.06310
## 1000437 -0.90695 -0.07920556 0.05895 2.2813 0.90340
## 1000505 0.27415 0.50850000 0.83880 0.1723 -0.08665
Match proteomics with risk factors and only take proteomics with
matched risk factors. Take main study including risk factors with
matched proteomics. And combine risk factors and proteomics to form main
study.
# only take proteomics samples with matched risk factors
proteomics0=proteomics0[which(rownames(proteomics0)
%in%rownames(filter_risk)),] \
# take risk factors with matched proteomics samples to form main study
filter_risk_main=filter_risk[which(rownames(filter_risk)%in%rownames(proteomics0)),]
# match the individuals for risk factors and proteomics
filter_risk_main=filter_risk_main[rownames(proteomics0),]
# form main study
main_study=data.frame(filter_risk_main,proteomics0)
print(table(main_study$Y))
## 0 1
## 41090 516
Take external study as the risk factors without matched
proteomics
external_study=filter_risk[!rownames(filter_risk)%in%rownames(main_study),]
print(table(external_study$Y))
## 0 1
## 382528 4117
scale the predictors from main study and external study
main_study[,-1]=scale(main_study[,-1])
external_study[,-1]=scale(external_study[,-1])
Extract summary statistics from external study
family = "binomial"
study_info=list()
external_glm=glm(Y~.,data = external_study,family = family)
study_external = list(
Coeff=external_glm$coefficients[-1],
Covariance=vcov(external_glm)[-1,-1],
Sample_size=nrow(external_study))
study_info[[1]] = study_external
Split main study into train and test.
library(caret)
set.seed(2023)
test_index=createDataPartition(main_study$Y,p = 0.3)$Resample1
y and X are training set for main study
y = main_study$Y[-test_index]
X = as.matrix(main_study[-test_index,-1])
X<-scale(X)
y_test = main_study$Y[test_index]
X_test = as.matrix(main_study[test_index,-1])
X_test<-scale(X_test)
p_risk = ncol(external_study)-1
main_lasso=cv.glmnet(x=X, y=y, family=family)
pred_main_lasso=predict(main_lasso,newx=X_test)
auc_main_lasso=auc(y_test,c(locfit::expit(pred_main_lasso)))
start_t=Sys.time()
main_external_htlgmm=cv.htlgmm(y = y,
Z = X[,c(1:p_risk)],
W = X[,-c(1:p_risk)],
study_info = study_info,
family = "binomial",
A = 1,nfolds = 5,
use_sparseC = T)
end_t=Sys.time()
sprintf('cv.htlgmm time: %.2f %s', end_t-start_t, units(end_t-start_t))
## [1] "cv.htlgmm time: 36.08 mins"
pred_main_external_htlgmm=X_test%*%main_external_htlgmm$beta[1:ncol(X_test)]
auc_main_external_htlgmm=auc(y_test,c(locfit::expit(pred_main_external_htlgmm)))
print(paste0("AUC on test data: lasso(",round(auc_main_lasso,4) ,"); htlgmm(",round(auc_main_external_htlgmm,4),")"))
[1] "AUC on test data: lasso(0.7208); htlgmm(0.7355)"
More details of the output
print(list("lambda_list"=main_external_htlgmm$lambda_list,
"cv_dev"=main_external_htlgmm$cv_dev,
"cv_auc"=main_external_htlgmm$cv_auc,
"lambda_min"=main_external_htlgmm$lambda_min))
$lambda_list
[1] 7.924390e-02 7.220409e-02 6.578967e-02 5.994510e-02 5.461974e-02
[6] 4.976748e-02 4.534627e-02 4.131783e-02 3.764727e-02 3.430279e-02
[11] 3.125543e-02 2.847878e-02 2.594880e-02 2.364358e-02 2.154315e-02
[16] 1.962932e-02 1.788550e-02 1.629660e-02 1.484886e-02 1.352973e-02
[21] 1.232778e-02 1.123262e-02 1.023474e-02 9.325516e-03 8.497063e-03
[26] 7.742208e-03 7.054411e-03 6.427717e-03 5.856696e-03 5.336403e-03
[31] 4.862332e-03 4.430376e-03 4.036793e-03 3.678176e-03 3.351417e-03
[36] 3.053686e-03 2.782405e-03 2.535224e-03 2.310002e-03 2.104787e-03
[41] 1.917804e-03 1.747432e-03 1.592195e-03 1.450748e-03 1.321868e-03
[46] 1.204437e-03 1.097438e-03 9.999446e-04 9.111122e-04 8.301715e-04
[51] 7.564214e-04 6.892230e-04 6.279943e-04 5.722050e-04 5.213719e-04
[56] 4.750547e-04 4.328521e-04 3.943987e-04 3.593614e-04 3.274368e-04
[61] 2.983482e-04 2.718438e-04 2.476939e-04 2.256895e-04 2.056398e-04
[66] 1.873714e-04 1.707258e-04 1.555590e-04 1.417396e-04 1.291478e-04
[71] 1.176747e-04 1.072208e-04 9.769558e-05 8.901657e-05
$cv_dev
[1] 1171.0360 1165.1190 1158.7579 1150.0591 1143.4851 1130.5217 1112.8933
[8] 1102.9994 1093.8668 1084.1645 1073.8451 1062.5654 1049.4282 1038.8750
[15] 1026.1266 1014.0542 1000.9688 984.2184 967.5052 955.3039 941.1043
[22] 921.3767 903.6551 881.3891 865.8054 850.2376 838.3713 830.4329
[29] 821.1622 810.4616 799.2384 790.5687 783.7159 773.3223 767.3663
[36] 759.2197 753.9895 749.8782 744.8718 742.3382 739.2882 737.5042
[43] 736.8055 733.8918 733.7895 736.4839 737.6967 740.5987 744.2263
[50] 745.6195 747.4699 748.7585 750.2006 751.4275 754.3241 756.9372
[57] 761.0621 765.0281 769.2121 772.6432 777.4776 784.3868 790.4195
[64] 797.7965 805.0640 813.2481 821.5518 831.2488 844.2613 853.1957
[71] 862.1866 870.2416 879.0524 888.3227
$cv_auc
[1] 0.5355444 0.5484380 0.5495516 0.5624143 0.5636469 0.6045397 0.6451086
[8] 0.6460113 0.6521692 0.6536681 0.6565917 0.6592563 0.6643396 0.6666150
[15] 0.6693790 0.6718747 0.6745930 0.6778317 0.6816452 0.6802493 0.6831655
[22] 0.6898077 0.6964528 0.7034943 0.7057747 0.7067143 0.7094672 0.7096894
[29] 0.7127296 0.7151948 0.7176934 0.7188960 0.7199919 0.7246034 0.7256663
[36] 0.7328785 0.7336300 0.7351701 0.7373017 0.7376359 0.7373469 0.7382812
[43] 0.7356914 0.7373801 0.7347140 0.7279085 0.7245793 0.7166923 0.7109710
[50] 0.7093706 0.7072480 0.7070194 0.7072382 0.7059920 0.7034580 0.7023161
[57] 0.6985695 0.6954391 0.6937577 0.6933914 0.6913604 0.6865806 0.6838517
[64] 0.6802693 0.6764776 0.6758094 0.6716031 0.6666118 0.6582157 0.6543859
[71] 0.6518471 0.6496574 0.6472631 0.6442306
$lambda_min
[1] 0.001321868
Plot to show the change of cross validation deviance against log
lambda (tuning parameter for lasso penalty).
df = data.frame(lambda = main_external_htlgmm$lambda_list,
cv_dev=main_external_htlgmm$cv_dev,
cv_auc=main_external_htlgmm$cv_auc)
library(ggplot2)
ggplot(df)+
geom_line(aes(x=log(lambda),y=cv_dev))+
geom_vline(xintercept = log(main_external_htlgmm$lambda_min),
linetype=2)+
theme(panel.background=element_rect(fill='transparent', color='black'))+
geom_text(aes(x=log(main_external_htlgmm$lambda_min),y=1000,label='lambda with min cv deviance'))

Plot to show the change of cross validation auc against log
lambda.
ggplot(df)+
geom_line(aes(x=log(lambda),y=cv_auc))+
geom_vline(xintercept = log(main_external_htlgmm$lambda_min),
linetype=2)+
theme(panel.background=element_rect(fill='transparent', color='black'))+
geom_text(aes(x=log(main_external_htlgmm$lambda_min),y=.6,label='lambda with min cv deviance'))
